# Data manipulation
import pandas as pd
import numpy as np
from numpy import linalg
# Support libs
import glob
import json
import datetime
from scipy import stats
import os
import requests
import subprocess
import re
import random
# Plotting
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import seaborn as sns
# Statistical tools
from scipy.spatial.distance import squareform, pdist
from scipy import stats
from scipy.stats import gamma, pearsonr
from scipy import signal as sp_signal
from sklearn import ensemble
from sklearn.cluster import DBSCAN
from shapely.geometry import MultiPoint
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import statsmodels.api as sm
### Zip Code database API
from uszipcode import ZipcodeSearchEngine
# Display full column width
pd.set_option('display.max_colwidth', -1)
from IPython.display import display, HTML
import plotly
from plotly import tools
import plotly.plotly as py
import plotly.graph_objs as go
import colorlover as clv
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.offline.offline import _plot_html
import cufflinks as cf
plotly.offline.init_notebook_mode() # run at the start of every notebook
def displayHTML(in_df):
    """Render a DataFrame as a rich HTML table in the notebook output."""
    markup = in_df.to_html()
    display(HTML(markup))
def Lowess(data, f=2. / 3., pts=None, itn=3):
    """Robust locally weighted regression (LOWESS) over a pandas Series.

    Fits a nonparametric trend to ``data`` by solving a weighted linear
    least-squares line fit in a neighbourhood around every point, then
    re-weighting by the residuals (bisquare) to damp the effect of outliers.

    data: (pandas.Series) y-values; the Series index supplies the x values.
    f:    (float) fraction of the data used for each local fit; a larger
          value yields a smoother curve. Capped at 1.0.
    pts:  (int) optional explicit neighbourhood size that overrides f.
    itn:  (int) number of robustifying iterations (fewer is faster).
    Returns a pandas.Series named 'Trend' holding the smoothed values.
    """
    # Authors: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
    #          Dan Neuman (conversion to Pandas series)
    # License: BSD (3-clause)
    xv = np.array(data.index)
    yv = data.values
    n_obs = len(data)
    # Neighbourhood size: either a fraction of the data or an explicit count.
    if pts is None:
        f = np.min([f, 1.0])
        span = int(np.ceil(f * n_obs))
    else:
        span = int(np.min([pts, n_obs]))
    # Per-point bandwidth: distance to the span-th nearest neighbour on x.
    bandwidth = [np.sort(np.abs(xv - xv[i]))[span] for i in range(n_obs)]
    # Tricube kernel weights over the scaled distances, clipped to [0, 1].
    kernel = np.clip(np.abs((xv[:, None] - xv[None, :]) / bandwidth), 0.0, 1.0)
    kernel = (1 - kernel ** 3) ** 3
    smoothed = np.zeros(n_obs)
    robust = np.ones(n_obs)
    for _ in range(itn):
        # Weighted least-squares line fit centred on each point i.
        for i in range(n_obs):
            local_w = robust * kernel[:, i]
            rhs = np.array([np.sum(local_w * yv), np.sum(local_w * yv * xv)])
            design = np.array([[np.sum(local_w), np.sum(local_w * xv)],
                               [np.sum(local_w * xv), np.sum(local_w * xv * xv)]])
            coef = linalg.solve(design, rhs)
            smoothed[i] = coef[0] + coef[1] * xv[i]
        # Bisquare re-weighting: points with large residuals lose influence.
        resid = yv - smoothed
        spread = np.median(np.abs(resid))
        robust = np.clip(resid / (6.0 * spread), -1, 1)
        robust = (1 - robust ** 2) ** 2
    return pd.Series(smoothed, index=data.index, name='Trend')
def non_persistent_ts( in_df, year=2016, threshold=0.2 ):
    """Identify cluster columns whose time series has too many gaps.

    Scans ``in_df`` (one column per geo cluster, DatetimeIndex named
    'local_datetime', columns axis named 'geo_cluster') for windows of 4
    consecutive missing readings, then computes the yearly rate of such
    windows for each cluster.

    in_df: (pandas.DataFrame) cluster time series with a DatetimeIndex.
    year: (int) the year whose gap rate decides the drop list
        (default 2016, the original hard-coded value).
    threshold: (float) clusters with a gap rate above this are dropped
        (default 0.2, the original hard-coded value).
    Returns (keep_list, drop_list, window_null_rates).

    NOTE(review): the original implementation read the global
    ``unstacked_df`` instead of its ``in_df`` parameter; fixed here.
    """
    # Flag positions where the trailing 4 observations are all null.
    # Rolling sums with fewer than 4 observations are NaN; NaN != 4 -> 0,
    # matching the original applymap(lambda x: 1 if x==4 else 0).
    window_flags = (in_df.isnull().rolling(4).sum() == 4).astype(int)
    window_flags = window_flags.reset_index()
    window_flags["year"] = window_flags["local_datetime"].map(lambda x: x.year)
    # Mean of the 0/1 flags per year equals the original year_sum / year_ct
    # ratio; dropping the datetime column replaces the original .T[:-1]
    # hack that sliced away the NaN 'local_datetime' row after transpose.
    window_null_rates = window_flags.drop(columns=["local_datetime"])\
                                    .groupby("year")\
                                    .mean()\
                                    .T\
                                    .rename_axis("geo_cluster")\
                                    .reset_index()
    drop_list = list(window_null_rates[window_null_rates[year] > threshold]["geo_cluster"])
    keep_list = [i for i in in_df.columns if i not in drop_list]
    return keep_list, drop_list, window_null_rates
def treat_ts( in_df, keep_list ):
    """Keep only persistent clusters and spline-interpolate their gaps.

    in_df: (pandas.DataFrame) one column per geo cluster.
    keep_list: (list) column labels to retain (see non_persistent_ts).
    Returns the filtered frame with nulls filled by order-3 spline
    interpolation, applied column by column.
    """
    # Drop the non-persistent cluster columns first.
    kept_df = in_df[keep_list]
    spline_fill = lambda col: col.interpolate(method='spline', order=3)
    # .loc[:, 0:] label-slices from cluster label 0 onward, as the original did.
    return kept_df.loc[:, 0:].apply(spline_fill)
def smoother( in_df ):
    """Apply the robust LOWESS smoother column-wise to a cluster DataFrame.

    in_df: (pandas.DataFrame) one time-series column per geo cluster with a
        DatetimeIndex. Nulls are filled with 0 before smoothing so LOWESS
        does not hit singular matrices.
    Returns a DataFrame of smoothed values indexed by 'local_datetime'.
    """
    # Column-wise LOWESS over a positional index; see Lowess() above.
    smooth = lambda col: Lowess(col, f=2. / 3., pts=None, itn=3)
    # Drop the datetime index so the x axis is positional, then fill gaps.
    smoother_df = in_df.reset_index(level=[0], drop=True).fillna(0).copy()
    smoother_df = smoother_df.apply(smooth, axis=0)
    # BUG FIX: restore the index from in_df — the original read the global
    # unstacked_df here, which breaks when any other frame is passed in.
    smoother_df["local_datetime"] = in_df.index
    smoother_df = smoother_df.set_index("local_datetime")
    return smoother_df
def detrend( smoother_df, unstacked_df ):
    """Compute residuals between observed series and their smoothed trend.

    smoother_df: (pandas.DataFrame) smoothed values, one column per cluster,
        positionally aligned with unstacked_df's rows.
    unstacked_df: (pandas.DataFrame) observed values with a DatetimeIndex.
    Returns (residuals_df, sq_residuals_df).
    """
    # Work on a copy so the caller's frame is not mutated — the original
    # wrote a 'local_datetime' column straight into the passed-in frame.
    trend_df = smoother_df.copy()
    trend_df["local_datetime"] = unstacked_df.index
    trend_df = trend_df.set_index("local_datetime")
    # residuals_df = unstacked_df[target_with_neighbors]-smoother_df
    # Subtraction aligns on the shared datetime index.
    residuals_df = unstacked_df - trend_df
    sq_residuals_df = residuals_df ** 2
    return residuals_df, sq_residuals_df
def cluster_ts( input_series ):
    """Label points of a series with DBSCAN (label -1 marks noise/outliers).

    input_series: (pandas.Series) values to cluster; nulls become 0.
    Returns the integer DBSCAN labels, one per observation.
    """
    X = pd.DataFrame(input_series.fillna(0))
    # Density-based clustering: points in sparse regions get label -1.
    db = DBSCAN(
        eps=.9
        , min_samples=4
        , leaf_size=30
        , metric="euclidean"
        ).fit(X)
    # Removed the core-sample mask the original computed here: it was
    # built and never read.
    return db.labels_
# --- Notebook cells: load the raw measurements and derive time fields ---
# Input Params
wd = "./../assets/data/aws-landing/"
start = "2016-01-01"
end = "2018-04-01"
# Gen Date Range
timeWindow = pd.date_range(start, end, freq=pd.tseries.offsets.DateOffset(days=1))
timeWindow
# NOTE: IPython shell magic (lists the landing directory); not plain Python.
! ls ../assets/data/aws-landing/
us_data = pd.read_csv("./../assets/data/concat-data/2016-2018.csv")
us_data.head(2)
len(us_data["location"].unique())
# Keep only the PM2.5 readings (Python 2 print statements below).
us_data_pm25_df = us_data[us_data["parameter"]=="pm25"].copy()
print len(us_data_pm25_df["location"].unique())
us_data_pm25_df.head(3)
print """Dates are available at local time and UTC, which
normalizes for the day; however this also means that
a single file will have data that spans over two days,
due to the time zones. \n"""
print "This is the local time: {} \n".format(us_data_pm25_df["local"][0])
# print "This is the utc (universaly coordinated time): {} \n".format(us_data["utc"][0])
print "The data are formatted as string types"
print type(us_data_pm25_df["local"][0])
# pd.DataFrame(date_map_df).head(2)
# Build a lookup of distinct local timestamps -> parsed datetime/hour/day,
# then merge it back so each reading carries the parsed fields.
date_map_df = pd.DataFrame(us_data_pm25_df["local"].drop_duplicates())
# creating a datetime format, for SARIMAX later
date_map_df["local_datetime"] = pd.to_datetime(date_map_df["local"])
# extracting the time and day
date_map_df["local_hour"] = date_map_df["local_datetime"].dt.hour
date_map_df["local_day"] = date_map_df["local_datetime"].dt.day
us_data_pm25_df = us_data_pm25_df.merge( date_map_df, on="local", how="left" )
print """ Checking the distribution of reading times to sample times
that all locations have in common. \n
We want to normalize for factors like rush hour. \n
It may be useful to think about the latency in terms of
emissions affects on the readings. \n"""
# Count readings per hour of day to find common sampling hours.
reading_time_cts = us_data_pm25_df[["local_hour","location"]].groupby("local_hour")\
    .count()\
    .rename(columns={"location":"count"}).reset_index()\
    .sort_values(by="local_hour")
df = reading_time_cts
# Bar chart of reading counts by hour of day.
fig = {
    'data' : [{
        'type' : 'bar',
        'y' : df["count"].tolist(),
        'x' : df.index.tolist(),
        'orientation' : 'v'
    }],
    'layout' : {
        "title": "Count of Readings by Hour"
        , 'xaxis': {'title': 'Hour of Day'}
        , 'yaxis': {'title': "Count"}
    }
}
iplot(fig, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
print """There are {} unique locations.
We are going to find a time of day that has this number of unique locations.
We will need to think about whether or not the times coincide with metrics,
if we are to look at multivariate distributions.
If we can't find an exact time, then we might have to choose bands.
""".format(len(us_data_pm25_df["location"].unique()))
# TODO: Change this to two hour windows and take the max
# Keep only the 10:00 and 18:00 readings to control for time-of-day effects.
us_data_pm25_df = us_data_pm25_df[us_data_pm25_df["local_hour"].isin([10,18])]
# us_data_pm25_df.to_csv("../assets/data/concat-data/2016-2018-us-pm25.csv")
# us_data_pm25_df = pd.read_csv("../assets/data/concat-data/2016-2018-us-pm25.csv")
We're using k-means here because it produces spheroidal clusters, which we will transform into networks to understand how cities impact each other and covary with respect to PM2.5 levels.
Another way to think about this is that we are performing a variation of hierarchical clustering, except with two steps that only consider the existing centroids.
# --- Notebook cells: sensor coordinates and k-means model selection ---
# Create a DataFrame of Unique Location Coordinates of ALL sensors in the US
coords_df = us_data_pm25_df[['location','latitude', 'longitude']].drop_duplicates()\
    .reset_index()\
    .drop("index",axis=1)
# Prep the Coordinate DF for a distance matrix
# NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0;
# .to_numpy() is the modern equivalent.
coords = coords_df.as_matrix(columns=['latitude', 'longitude'])
print len(coords)
print len(coords_df)
cf.set_config_file(offline=True, world_readable=True, theme='pearl')
# Scatter every sensor inside a rough continental-US bounding box.
df = coords_df[\
    (coords_df["longitude"].between(-145, -60))\
    &(coords_df["latitude"].between(20, 55))]\
    [["latitude","longitude"]]
df["geo_cluster"] = 1
fig = {
    'data': [
        {
            'x': df[df['geo_cluster']==geo_cluster]['longitude'],
            'y': df[df['geo_cluster']==geo_cluster]['latitude'],
            'name': geo_cluster, 'mode': 'markers',
        } for geo_cluster in df["geo_cluster"].unique()
    ],
    'layout': {
        "title": "Air Quality Sensors in the US"
        , 'xaxis': {'title': 'Longitude'}
        , 'yaxis': {'title': "Latitude"}
    }
}
plotly.offline.iplot(fig)
# create feature vectors to train a kmeans model
locations = coords_df['location'].values
f1 = coords_df['latitude'].values
f2 = coords_df['longitude'].values
X = np.array(list(zip(f1, f2)))
# Looping through silhouette scores to find an optimal compactness and isolation
cluster_range = range( 30, 85 )
cluster_errors = []
cluster_sil_means = []
for num_clusters in cluster_range:
    clusters = KMeans( num_clusters, random_state=0 )
    clusters.fit( X )
    cluster_errors.append( clusters.inertia_ )
    # NOTE(review): fit_predict re-fits the already-fitted model; with
    # random_state=0 the labels match, but clusters.labels_ would avoid
    # the duplicate work.
    cluster_labels = clusters.fit_predict( X )
    silhouette_avg = silhouette_score( X, cluster_labels )
    cluster_sil_means.append( silhouette_avg )
clusters_df = pd.DataFrame( {\
    "num_clusters":cluster_range\
    , "cluster_errors": cluster_errors
    , "mean_sil": cluster_sil_means } )
# Elbow plot: inertia (SSE) as a function of k.
plotly.offline.iplot({
    "data": [{
        "x": clusters_df.num_clusters,
        "y": clusters_df.cluster_errors
    }],
    "layout": {
        "title": "Inertia by Number of Geo Clusters"
        , "xaxis": {"title":"K Clusters"}
        , "yaxis": {"title":"SSE"}
    }
}, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
# --- Notebook cells: pick k by silhouette score, fit k-means, map clusters ---
# Silhouette score as a function of k.
plotly.offline.iplot({
    "data": [{
        "x": clusters_df.num_clusters,
        "y": clusters_df.mean_sil
    }],
    "layout": {
        "title": "Mean Silhouette Score by Number of Geo Clusters"
        , "xaxis": {"title":"K Clusters"}
        , "yaxis": {"title":"Silhoutte Score"}
    }
}, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
# Choose the k that maximizes the mean silhouette score.
sil_info = clusters_df[clusters_df.mean_sil==clusters_df.mean_sil.max()]
k = sil_info.num_clusters.values[0]
print "At k={0} the silhouette score is maximized at {1}".format(\
    k\
    , sil_info.mean_sil.values[0] )
# Number of clusters
kmeans = KMeans(n_clusters=k, random_state=0)
# Fitting the input data
kmeans = kmeans.fit(X)
# Getting the cluster labels
labels = kmeans.predict(X)
# Centroid values
centroids = kmeans.cluster_centers_
# centroids
# Attach the geo-cluster label to each sensor location.
coords_df["geo_cluster"] = labels
# group the cluster counts and plot their frequencies
cluster_counts = coords_df[["geo_cluster","location"]].groupby("geo_cluster")\
    .count()\
    .rename(columns={"location":"count"})\
    .sort_values(by="count", ascending=False)\
    .reset_index()
df = cluster_counts
fig = {
    'data' : [{
        'type' : 'bar',
        'y' : df["count"].tolist(),
        'x' : df.index.tolist(),
        'orientation' : 'v'
    }],
    'layout' : {
        "title": "Count of Sensors by Cluster"
        , 'xaxis': {'title': 'Cluster Label'}
        , 'yaxis': {'title': "Count"}
    }
}
iplot(fig, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
# Cycle palette indices 2..7 across the clusters to build a colour map.
# NOTE(review): the Python 2 integer division below truncates; in Python 3
# this would need // to stay an int.
cmap = []
for i in range(len(coords_df["geo_cluster"].unique())/6):
    cmap.append([j for j in range(2,8)])
flattened_cmap = [y for x in cmap for y in x]
color_dict = dict(zip(coords_df["geo_cluster"].unique(),flattened_cmap))
cf.set_config_file(offline=True, world_readable=True, theme='pearl')
bupu = clv.scales['9']['seq']['PuBu']
# Map of the clustered sensors, one trace per cluster, with a bounding
# circle shape per cluster.
df = coords_df[\
    (coords_df["longitude"].between(-145, -60))\
    &(coords_df["latitude"].between(20, 55))]\
    [["latitude","longitude","geo_cluster"]]
fig = {
    'data': [
        {
            'x': df[df['geo_cluster']==geo_cluster]['longitude']
            , 'y': df[df['geo_cluster']==geo_cluster]['latitude']
            , 'name': geo_cluster, 'mode': 'line'
            , "marker":dict( color=bupu[color_dict[geo_cluster]])
        } for geo_cluster in df["geo_cluster"].unique()
    ],
    'layout': {
        "title": "Sensors in the US by Geospatial Cluster"
        , 'xaxis': {'title': 'Longitude'}
        , 'yaxis': {'title': "Latitude"}
        , "shapes": [
            {
                'type': 'circle',
                'xref': 'x',
                'yref': 'y',
                'x0': min(df[df['geo_cluster']==geo_cluster]['longitude']),
                'y0': min(df[df['geo_cluster']==geo_cluster]['latitude']),
                'x1': max(df[df['geo_cluster']==geo_cluster]['longitude']),
                'y1': max(df[df['geo_cluster']==geo_cluster]['latitude']),
                'opacity': 0.2,
                'fillcolor': bupu[color_dict[geo_cluster]],
                "line":dict( color=bupu[color_dict[geo_cluster]])
            } for geo_cluster in df["geo_cluster"].unique()
        ]
    }
}
plotly.offline.iplot(fig, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
# --- Notebook cells: attach cluster labels, centroids, cluster distances ---
# Merge Geo Clusters in on lat lng for precision
working_df = us_data_pm25_df.merge(\
    coords_df[["latitude","longitude","location","geo_cluster"]]\
    , on=["latitude","longitude","location"], how="left" )
working_df.head(3)
# Fraction of readings that failed to pick up a cluster label in the merge.
"{} pct null labels".format(\
    len(working_df[working_df["geo_cluster"].isnull()])\
    /(len(working_df)*1.00))
# Calculate the mid point
# Cluster centroid = mean lat/lng of its member sensors.
midpoints_df = working_df[[\
    "geo_cluster"\
    , "latitude"\
    , "longitude"\
    ]].groupby("geo_cluster")\
    .mean()\
    .reset_index()
# Calculate a distance matrix
# Pairwise Euclidean distance between cluster centroids, in degrees.
dist_matrix = pd.DataFrame(\
    squareform(pdist( midpoints_df.iloc[:, 1:] ))\
    , columns= midpoints_df.geo_cluster.unique()\
    , index= midpoints_df.geo_cluster.unique() )
# 1 degree is about 111km
# NOTE(review): 111 km/degree holds for latitude; longitude degrees shrink
# with latitude, so this is a rough approximation — confirm acceptable.
dist_matrix_km = dist_matrix*111
def flag_neighbor(x):
    """Return 1 when a centroid distance (km) is a neighbour link, else 0.

    A link is a non-zero distance of at most 400 km. Distances are
    truncated to whole km first, matching the original int() behaviour;
    a zero distance (a cluster paired with itself) is never a link.
    """
    km = int(x)
    return 1 if km != 0 and km <= 400 else 0
# --- Notebook cells: build the neighbour adjacency and a labelled view ---
# 0/1 adjacency matrix: 1 where two cluster centroids are within 400 km.
dist_matrix_dummies = dist_matrix_km.applymap(flag_neighbor)\
    .reset_index()\
    .rename(columns={"index":"geo_cluster"})
print "Now we have a list of all of the clusters as well as the distance matrix transformed:"
print dist_matrix_dummies["geo_cluster"].unique()
# hot encoding for neighbors
dist_matrix_dummies.head()
# Create a dictionary that contains a list of neighbors for each cluster
# We can pass this in as a filter to make it easier to map model fit logic later
neighbor_dict = {}
# Loop through the unique clusters
for cluster in dist_matrix_dummies["geo_cluster"].unique():
    # Transpose the row into a column vector
    neighbor_flags = dist_matrix_dummies[dist_matrix_dummies["geo_cluster"]==cluster].iloc[:, 1:].T
    neighbor_flags = neighbor_flags.rename(\
        columns={\
            neighbor_flags.columns.values[0]:"neighbor_ind"\
        })
    # When there is a neighbor flag, take the index and store it in a dictionary
    neighbor_list = list(neighbor_flags[neighbor_flags["neighbor_ind"]==1].index)
    neighbor_dict[cluster] = neighbor_list
neighbor_dict
# pick a single cluster
target = 50
# fetch the neighbors of that cluster
target_neighbors = neighbor_dict[target]
# create a dataframe that contains a label for neighbors of that cluster
filterd_df = working_df[['geo_cluster','latitude', 'longitude']].drop_duplicates().copy()
filterd_df["filter_cluster"] = "Not in Network"
filterd_df.loc[(filterd_df["geo_cluster"]==target),"filter_cluster"] = "Cluster No. {0}".format(target)
filterd_df.loc[(filterd_df["geo_cluster"].isin(target_neighbors)),"filter_cluster"] = "In Network"
cmap = ["0", "1", "2"]
color= filterd_df['filter_cluster'].astype(str)
# NOTE: labels order depends on first appearance in the frame — the plot
# below indexes labels[0..2] assuming all three categories are present.
labels = filterd_df["filter_cluster"].unique()
labels
print """
We have just created a dictionary that has a list of neighboring clusters for
each cluster.
- We should see a ring surrounding the target cluster below if we choose a
color mapping with three colors and assign them to:
1. the target cluster
2. the neighbors in the target cluster
3. all clusters that are not neighbors
"""
cf.set_config_file(offline=True, world_readable=True, theme='pearl')
df = filterd_df[\
(filterd_df["longitude"].between(-145, -60))\
&(filterd_df["latitude"].between(20, 55))]\
[["latitude","longitude","filter_cluster"]]
fig = {
'data': [
{
'x': df[df['filter_cluster']==labels[1]]['longitude'],
'y': df[df['filter_cluster']==labels[1]]['latitude'],
'name': labels[1], 'mode': 'markers',
"line":dict( color=bupu[3])
},
{
'x': df[df['filter_cluster']==labels[0]]['longitude'],
'y': df[df['filter_cluster']==labels[0]]['latitude'],
'name': labels[0], 'mode': 'markers',
"line":dict( color=bupu[5])
},
{
'x': df[df['filter_cluster']==labels[2]]['longitude'],
'y': df[df['filter_cluster']==labels[2]]['latitude'],
'name': labels[2], 'mode': 'markers',
"line":dict( color=bupu[7])
}
],
'layout': {
"title": "Sensors in the US by Geospatial Cluster: Emphasis on Cluster No. {0}".format(target)
, 'xaxis': {'title': 'Longitude'}
, 'yaxis': {'title': "Latitude"}
}
}
plotly.offline.iplot(fig, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
print "there are dups, even when controlling for location, date, and metric"
location_reading_ct = us_data_pm25_df[us_data_pm25_df['parameter']=="pm25"][["location","city"]].groupby("location")\
.count()\
.sort_values(by="city", ascending=False)
displayHTML( location_reading_ct.head() )
print """
some regions have readings for multiple times in the day.
we will need to normalize for this because rush hour will
probably elevate readings.
"""
displayHTML( us_data_pm25_df[(us_data_pm25_df['location']=="NCORE")&(us_data_pm25_df['parameter']=="pm25")].head(3) )
xn = location_reading_ct["city"]
gkde=stats.gaussian_kde(xn)
ind = np.linspace(0,3000,100)
kdepdf = gkde.evaluate(ind)
kdedf = pd.DataFrame(kdepdf)
print """
Below is a distribution of samples obtained by each sensor location (measuring pm25) over the entire
window of observation. There are a few interesting takeaways here:
1. the distribution is not uniform, which may be due to sensors being added over time and
therefore having less readings. This means that:
- we need to be careful when modeling the time series
- the matrix operations will necessitate equal column vector lengths, so we should be thoughtful
about imputation
2. the multimodal nature of the distribution suggests a couple of possibilities:
- a single location might have multiple readings, which when modelled, could lead to
variance inflation or biased results, deppending on if or which parametric approach
we apply
"""
plotly.offline.iplot({
"data": [{
"x": ind,
"y": kdepdf
, 'fill':'tozeroy'
}],
"layout": {
"title": "Distribution of Samples by Sensor"
, "xaxis": {"title":"Count of Samples"}
, "yaxis": {"title":"p(X=x)"}
}
}, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
- We'll do this for one reading time, so we can control for variance related to the time of day.
# --- Notebook cells: aggregate to one daily series per cluster ---
# Mean PM2.5 per cluster per timestamp, using only the 18:00 readings.
grouped_ts_df = working_df[working_df["local_hour"]==18]\
    [["geo_cluster","location","local_datetime","value"]]\
    .groupby(["geo_cluster", "local_datetime"])\
    .mean()\
    .reset_index()
df = grouped_ts_df[["geo_cluster","local_datetime","value"]].drop_duplicates()\
    .copy()
# set a datetime64 dtype to the date, so that we can use it as an index
df.local_datetime = pd.to_datetime(df.local_datetime)
# set the index as the date field
df = df.set_index('local_datetime')
# create a generator function for groupby operation, that resamples to a 1 day
## for the entire dataset, and then unstack it into a column for each cluster
grouper = df.groupby([pd.Grouper(freq='1D'), "geo_cluster"])
unstacked_df = grouper['value'].sum().unstack('geo_cluster')
print """
The time series for the cluster below illustrates something that we need
to be mindful of:
- The measurements do not persist throughout the time series,
so if we thoughtlessly interpolated the null values, LOWESS will output
singular matrix errors.
"""
problem_cluster = 13
plotly.offline.iplot({
    "data": [
        {
            "x": unstacked_df.index
            , "y": unstacked_df[problem_cluster]
            ,"name": "Cluster No. {0}".format(target)
            # NOTE(review): 'tozeroy' is a plotly fill mode, not a colour —
            # this line colour is likely ignored; confirm intent.
            , "line":dict( color='tozeroy')
        }
    ],
    "layout": {
        "title": "PM2.5 Measurements for Cluster No. {0} (Daily)".format( problem_cluster )
        , "yaxis": {"title":"µg/m3"}
        , "legend": dict(orientation="h")
    }
}, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
# --- Notebook cells: exercise non_persistent_ts / treat_ts on the data ---
#####################################################################################
################################ Test ###############################################
#####################################################################################
keep_list, drop_list, window_null_rates = non_persistent_ts( unstacked_df )
#####################################################################################
################################ Test ###############################################
#####################################################################################
print drop_list
print keep_list
# Clusters whose 4-day null-window rate exceeds 20% in each year.
displayHTML(window_null_rates[window_null_rates[2016]>.2])
displayHTML(window_null_rates[window_null_rates[2017]>.2])
displayHTML(window_null_rates[window_null_rates[2018]>.2])
#####################################################################################
################################ Test ###############################################
#####################################################################################
# Drop the non-persistent clusters and spline-interpolate the gaps.
unstacked_df = treat_ts( unstacked_df, keep_list )
#####################################################################################
################################ Test ###############################################
#####################################################################################
print """
Now all clusters have the same time series, even if there were no readings, and this
is apparent because there are null observations if we print the beginning of the
observation window:
"""
displayHTML( unstacked_df.head(3) )
print """
In more recent readings, the data are well populated:
"""
displayHTML( unstacked_df.tail(3) )
# --- Notebook cells: plot the target cluster against its neighbours' mean ---
# pick a single cluster
target = 50
# target = 29
# target = 48
# fetch the neighbors of that cluster
target_neighbors = neighbor_dict[target]
target_with_neighbors = list(set([target]).union(set(target_neighbors)))
print "The target is cluster number: {0}".format(target)
print "The target's nearest neighbors are: {0}".format(target_neighbors)
print "The target and it's neighbors are: {0}".format(target_with_neighbors)
plotly.offline.iplot({
    "data": [
        {
            "x": unstacked_df.index
            , "y": unstacked_df[target]
            ,"name": "Cluster No. {0}".format(target)
            # NOTE(review): 'tozeroy' is a fill mode, not a colour.
            , "line":dict( color='tozeroy')
        }
        , {
            "x": unstacked_df.index
            , "y": unstacked_df[target_neighbors].mean(axis=1)
            ,"name": "Mean of clusters: {0}".format(target_neighbors)
            , "line":dict( color='rgb(0,0,0)')
        }
    ],
    "layout": {
        "title": "PM2.5 Measurements for Cluster No. {0} (Daily)".format(target)
        , "yaxis": {"title":"µg/m3"}
        , "legend": dict(orientation="h")
    }
}, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
# --- Notebook cells: compare HP filter, bidirectional EWMA, and LOWESS ---
#####################################################################################
################################ Test ###############################################
#####################################################################################
smoother_df = smoother( unstacked_df )
#####################################################################################
################################ Test ###############################################
#####################################################################################
# NOTE(review): x is assigned here but immediately shadowed further down.
x = range(len(unstacked_df.index)+1)
y = unstacked_df[target].dropna() #.values
# 129600 is the standard Hodrick-Prescott lambda for monthly data.
lamb = 129600 # parameter for 12m smoothing
cycle_annual, trend_annual = sm.tsa.filters.hpfilter(y, lamb=lamb)
hp_decompose_df = pd.DataFrame(y)
hp_decompose_df['cycle'] = cycle_annual
hp_decompose_df['trend'] = trend_annual
hp_decompose_df=hp_decompose_df.rename(columns={"trend":"HP FILTER"}).reset_index()
# Bidirectional EWMA: smooth forward and backward, then average, to remove
# the phase lag a one-directional EWMA introduces.
ewma = pd.Series.ewm
ewma_df = unstacked_df[unstacked_df[target].notnull()][target].reset_index()
x = ewma_df.index
y = ewma_df[target] #.values
df = pd.Series(y)
# take EWMA in both directions then average them
fwd = ewma(df,span=10).mean() # take EWMA in fwd direction
bwd = ewma(df[::-1],span=10).mean() # take EWMA in bwd direction
filtered = np.vstack(( fwd, bwd[::-1] )) # lump fwd and bwd together
filtered = np.mean(filtered, axis=0 ) # average
ewma_df["EWMA"] = filtered
ewma_df = ewma_df.drop(target, axis=1)
# Statsmodels LOWESS on the non-null observations of the target cluster.
lowess = sm.nonparametric.lowess
sm_lowess_df = unstacked_df[target].dropna()
# sm_lowess_df = unstacked_df[target].fillna(0)
x=sm_lowess_df.index
y=sm_lowess_df
w = lowess(y, x, frac=1./3)
# Column 1 of the lowess result holds the fitted values.
# NOTE: map() returns a list in Python 2; Python 3 would need list(map(...)).
f = lambda x: x[0]
sm_lowess_df = pd.DataFrame(map(f,w[:,1:]), columns=["LOWESS"])
sm_lowess_df["local_datetime"]=x
# Merge the three smoothers onto the observed series by datetime.
merge_models_df = unstacked_df[target].reset_index().rename(columns={target:"EVIDENCE"})
merge_models_df = merge_models_df.merge( hp_decompose_df, on="local_datetime", how="left" )\
    .merge( ewma_df, on="local_datetime", how="left" )\
    .merge( sm_lowess_df, on="local_datetime", how="left" )
merge_models_df["EVIDENCE"] = unstacked_df[target].values
plot_df=merge_models_df.dropna()
merge_models_df.head()
# NOTE(review): the plot_df / CATEGORY frame built below is never used by
# the plot that follows (it reads merge_models_df directly).
merge_models_df["CATEGORY"] = "Model"
plot_df = merge_models_df.copy()
plot_df["EVIDENCE"] = merge_models_df["EVIDENCE"]
plot_df["HP FILTER"] = merge_models_df["EVIDENCE"]
plot_df["EWMA"] = merge_models_df["EVIDENCE"]
plot_df["LOWESS"] = merge_models_df["EVIDENCE"]
plot_df["CATEGORY"] = "Evidence"
plotly.offline.iplot({
    "data": [
        {
            "x": merge_models_df.index
            , "y": merge_models_df["EVIDENCE"]
            ,"name": "Observations"
            , 'mode': 'markers'
            , "line":dict( color=bupu[2],width=1, shape="cross")
        },
        {
            "x": merge_models_df.index
            , "y": merge_models_df["HP FILTER"]
            ,"name": "HP FILTER"
            , "line":dict( color=bupu[8], width= 3)
        }
        ,
        {
            "x": merge_models_df.index
            , "y": merge_models_df["EWMA"]
            ,"name": "EWMA"
            , "line":dict( color=bupu[6], width= 3)
        }
        ,
        {
            "x": merge_models_df.index
            , "y": merge_models_df["LOWESS"]
            ,"name": "LOWESS"
            , "line":dict( color=bupu[4], width= 3)
        }
    ],
    "layout": {
        "title": "PM2.5 Smoothing Parameters for Cluster No. {0} (Daily)".format(target)
        , "yaxis": {"title":"µg/m3"}
        , "legend": dict(orientation="h")
    }
}, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
# --- Notebook cells: squared residuals of each smoother vs the evidence ---
compare_resids_df =\
    merge_models_df[["LOWESS","EWMA","HP FILTER"]].apply(lambda x: merge_models_df["EVIDENCE"]-x)
compare_resids_df.columns = [i+" RESIDUALS" for i in compare_resids_df.columns]
compare_resids_df = compare_resids_df.set_index(unstacked_df.index)
# Square the residuals so large deviations stand out.
compare_resids_df = compare_resids_df**2
df = compare_resids_df.reset_index()
df["local_datetime"] = df["local_datetime"].astype(str)
# Three stacked subplots, one per smoother.
# NOTE(review): the .format() calls on the subplot titles are no-ops —
# the title strings contain no {} placeholders.
chart_data = cf.subplots([df.figure(\
    x='local_datetime'
    , y="LOWESS RESIDUALS"
    , colors=[bupu[5]]
    ),
    df.figure(\
    x='local_datetime'
    , y="EWMA RESIDUALS"
    , colors=[bupu[5]]
    ),\
    df.figure(\
    x='local_datetime'
    , y="HP FILTER RESIDUALS"
    , colors=[bupu[5]]
    ),\
    ],shape=(3,1), theme="white",subplot_titles=(
    "LOWESS SQUARED RESIDUALS".format(target)
    , "EWMA SQUARED RESIDUALS".format(target_neighbors[0])
    , "HP FILTER SQUARED RESIDUALS".format(target_neighbors[1])
    ))
chart_data['layout'].update(showlegend=False, plot_bgcolor='#FFFFFF', paper_bgcolor='#FFFFFF')
for i in range(0,3):
    chart_data['data'][i].update(mode="lines")
iplot(chart_data, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
def cluster_residuals( X ):
    """Run DBSCAN over a residual feature matrix; label -1 marks outliers.

    X: (array-like) 2-D feature matrix, e.g. squared residuals reshaped to
        a column vector.
    Returns the integer DBSCAN labels, one per row of X.
    """
    # Density-based clustering: points in sparse regions get label -1.
    db = DBSCAN(
        eps=.9
        , min_samples=4
        , leaf_size=30
        , metric="euclidean"
        ).fit(X)
    # Removed the core-sample mask the original built here — it was never
    # read before returning.
    return db.labels_
# --- Notebook cells: flag residual outliers per smoother and plot them ---
# function to map the db scan
f = lambda x: cluster_residuals( np.array(x.fillna(0)).reshape(-1, 1) )
# apply the map to the dataframe
# A DBSCAN label of -1 means outlier: convert to 1 (flagged) / NaN (normal).
flagged_df = compare_resids_df.apply(f, axis=0)\
    .apply(lambda x: (x==-1)*1 ).replace(0,np.nan)
# multiply the indicators for each smoothing method by the original time series for the target
outliers_ts = flagged_df.apply(lambda x: x*unstacked_df[target], axis=0)
# Observed series (Y PRED=0) stacked over flagged outliers (Y PRED=1).
observed_plot_df = pd.DataFrame(unstacked_df.index)
observed_plot_df["EWMA RESIDUALS"]=unstacked_df[target].values
observed_plot_df["HP FILTER RESIDUALS"]=unstacked_df[target].values
observed_plot_df["LOWESS RESIDUALS"]=unstacked_df[target].values
observed_plot_df["Y PRED"]=0
outliers_plot_df = outliers_ts.copy().reset_index()
outliers_plot_df["Y PRED"]=1
ts_cat_df = pd.concat([observed_plot_df, outliers_plot_df], axis=0)
ts_cat_df["local_datetime"]=ts_cat_df["local_datetime"].astype(str)
df =ts_cat_df
# Three subplots on a 2x2 grid: black = observed, red = flagged outlier.
chart_data = cf.subplots(\
    [df.figure(\
    kind='scatter'
    , mode='markers'
    , x='local_datetime'
    , y="EWMA RESIDUALS"
    , categories="Y PRED"
    , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
    ),
    df.figure(\
    kind='scatter'
    , mode='markers'
    , x='local_datetime'
    , y="HP FILTER RESIDUALS"
    , categories="Y PRED"
    , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
    ),\
    df.figure(\
    kind='scatter'
    , mode='markers'
    , x='local_datetime'
    , y="LOWESS RESIDUALS"
    , categories="Y PRED"
    , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
    )
    ],shape=(2,2), theme="white",subplot_titles=(
    "EWMA + dbScan"
    , "HP FILTER + dbScan"
    , "LOWESS + dbScan"
    ))
chart_data['layout'].update(showlegend=False, plot_bgcolor='#FFFFFF', paper_bgcolor='#FFFFFF')
for i in range(0,6):
    chart_data['data'][i]['marker'].update(size=5)
iplot(chart_data, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
def apply_lowess(x, full_index=None):
    """Smooth one cluster column with statsmodels LOWESS and re-align it.

    x: (pandas.Series) observed values; nulls are dropped before fitting.
    full_index: (pandas.Index) the full datetime index to re-align the
        smoothed values onto. Defaults to the global ``unstacked_df.index``
        for backward compatibility with the original notebook cell.
    Returns a Series named 'LOWESS' aligned to full_index, NaN where the
    input had no observation.
    """
    if full_index is None:
        # NOTE(review): legacy dependency on the notebook-global frame.
        full_index = unstacked_df.index
    lowess = sm.nonparametric.lowess
    observed = x.dropna()
    dates_df = pd.DataFrame(full_index)
    # Fit on the index positions of the non-null observations.
    fitted = lowess(observed, observed.index, frac=1. / 3)
    # Column 1 of the lowess output holds the smoothed estimates; slicing
    # it directly replaces the Python-2-only map()-over-rows the original
    # used to extract the same column.
    output_df = pd.DataFrame({"LOWESS": fitted[:, 1]})
    output_df["local_datetime"] = observed.index
    # Left-merge back onto the full date range so gaps stay NaN.
    merged_df = dates_df.merge(output_df, on="local_datetime", how="left")
    return merged_df["LOWESS"]
# --- Notebook cells: LOWESS + DBSCAN outlier flags for every cluster ---
smoother_applied_df = unstacked_df.apply(lambda x: apply_lowess(x)).set_index(unstacked_df.index)
calc_resids_applied_df = unstacked_df-smoother_applied_df
calc_sq_resids_applied_df = calc_resids_applied_df**2
# function to map the db scan
f = lambda x: cluster_residuals( np.array(x.fillna(0)).reshape(-1, 1) )
# apply the map to the dataframe
# Label -1 (outlier) becomes 1; everything else becomes NaN.
flagged_df = calc_sq_resids_applied_df.apply(f, axis=0)\
    .apply(lambda x: (x==-1)*1 ).replace(0,np.nan)
# multiply the indicators for each smoothing method by the original time series for the target
outliers_ts = flagged_df*unstacked_df
midpoints_df["City"] = midpoints_df[["latitude", "longitude"]].apply(lambda x:\
search.by_coordinate(\
x[0]\
, x[1]\
, radius=30\
, returns=1)[0].City\
, axis=1)
midpoints_df["State"] = midpoints_df[["latitude", "longitude"]].apply(lambda x:\
search.by_coordinate(\
x[0]\
, x[1]\
, radius=30\
, returns=1)[0].State\
, axis=1)
midpoints_df["City_State"] =midpoints_df["City"]+", "+midpoints_df["State"]
cities_dict = midpoints_df.set_index("geo_cluster").to_dict("index")
str(cities_dict[50]["City_State"])
# --- Notebook cells: outlier overlays for the target and three neighbours ---
# outliers_ts
outliers_plot_df = outliers_ts.copy().reset_index()
outliers_plot_df["Y PRED"]=1
observed_plot_df = unstacked_df.copy().reset_index()
observed_plot_df["Y PRED"]=0
# Stack observed (Y PRED=0) and flagged-outlier (Y PRED=1) rows.
ts_cat_df = pd.concat([observed_plot_df, outliers_plot_df], axis=0)
ts_cat_df["local_datetime"]=ts_cat_df["local_datetime"].astype(str)
df =ts_cat_df
# 2x2 grid: the target cluster and its first three neighbours,
# titled with the reverse-geocoded City, State of each centroid.
chart_data = cf.subplots(\
    [df.figure(\
    kind='scatter'
    , mode='markers'
    , x='local_datetime'
    , y=target
    , categories="Y PRED"
    , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
    ),
    df.figure(\
    kind='scatter'
    , mode='markers'
    , x='local_datetime'
    , y=target_neighbors[0]
    , categories="Y PRED"
    , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
    ),\
    df.figure(\
    kind='scatter'
    , mode='markers'
    , x='local_datetime'
    , y=target_neighbors[1]
    , categories="Y PRED"
    , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
    ),\
    df.figure(\
    kind='scatter'
    , mode='markers'
    , x='local_datetime'
    , y=target_neighbors[2]
    , categories="Y PRED"
    , colors=['rgb(0,0,0)', 'rgb(255,0,0)']
    )
    ],shape=(2,2), theme="white",subplot_titles=(
    # "Cluster No. {0}".format(target)
    # , "Cluster No. {0}".format(target_neighbors[0])
    # , "Cluster No. {0}".format(target_neighbors[1])
    # , "Cluster No. {0}".format(target_neighbors[2])
    "Cluster No. {0}".format(str(cities_dict[target]["City_State"]))
    , "Cluster No. {0}".format(str(cities_dict[target_neighbors[0]]["City_State"]))
    , "Cluster No. {0}".format(str(cities_dict[target_neighbors[1]]["City_State"]))
    , "Cluster No. {0}".format(str(cities_dict[target_neighbors[2]]["City_State"]))
    ))
chart_data['layout'].update(showlegend=False, plot_bgcolor='#FFFFFF', paper_bgcolor='#FFFFFF')
for i in range(0,8):
    chart_data['data'][i]['marker'].update(size=5)
iplot(chart_data, config={"displaylogo":False, 'displayModeBar': False}, show_link=False)
# --- Notebook cells: lagged outlier-flag table and monthly outlier counts ---
# NOTE(review): the next four statements are dead work — cpt_prep_df is
# rebuilt from scratch at the `cpt_prep_df = flagged_df[...]` line below.
cpt_prep_df = flagged_df[target_with_neighbors]
cpt_prep_df=pd.concat([flagged_df[target][1:],cpt_prep_df[target_neighbors].shift(1)],axis=1)
cpt_prep_df = flagged_df[target_with_neighbors].copy()
new_names = [str(i)+"_lag_1d" for i in cpt_prep_df.columns.values]
cpt_prep_df.columns=new_names
cpt_prep_df = flagged_df[target_with_neighbors]
cpt_prep_lag_df=cpt_prep_df.copy().shift(1)
new_names = [str(i)+"_lag_1d" for i in cpt_prep_df.columns.values]
cpt_prep_lag_df.columns=new_names
# NOTE(review): cpt_prep_lag_df was already shifted by 1 above; the extra
# .shift(1) here makes the "_lag_1d" columns a 2-day lag — confirm intent.
cpt_prep_df=pd.concat([cpt_prep_df[1:],cpt_prep_lag_df.shift(1)],axis=1)
# NOTE(review): the groupby keys are hard-coded to one specific neighbour
# set; this breaks for any other `target`.
cpt_counts = cpt_prep_df.fillna(0).groupby(['1_lag_1d', '16_lag_1d',
    '50_lag_1d', '20_lag_1d', '24_lag_1d', '57_lag_1d', '58_lag_1d',
    '62_lag_1d']).sum()#.reset_index()
# Normalize counts to rates over the observation window.
cpt_df = cpt_counts/(len(cpt_prep_df)*1.00)
cpt_df
# resample to montly
resample_m_df = outliers_ts.resample("m").count()
resample_m_df[[20, 24, 57, 58, 62,50]].plot()
midpoints_df.head()
# Zip-code search engine used by the reverse-geocoding cells above
# (uszipcode is already imported at the top of the file).
from uszipcode import ZipcodeSearchEngine
search = ZipcodeSearchEngine()
# NOTE(review): duplicated call — the second lookup is redundant.
res = search.by_coordinate(39.122229, -77.133578, radius=30, returns=1)
res = search.by_coordinate(39.122229, -77.133578, radius=30, returns=1)
res[0]